{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Demonstrating statsmodels Modin Interoperability\n",
    "###  Currently statsmodels is not completely interoperable with Modin. All the examples in this section are taken/ adapted from https://www.statsmodels.org/devel/gettingstarted.html or https://www.statsmodels.org/stable/index.html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import statsmodels.api as sm\n",
    "import pandas\n",
    "import modin.pandas as pd\n",
    "from patsy import dmatrices"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Example with sm.OLS()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = sm.datasets.get_rdataset(\"Guerry\", \"HistData\").data\n",
    "modin_df = pd.DataFrame(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "vars = ['Department', 'Lottery', 'Literacy', 'Wealth', 'Region']\n",
    "\n",
    "modin_df = modin_df[vars]\n",
    "\n",
    "modin_df[-5:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "modin_df = modin_df.dropna()\n",
    "\n",
    "modin_df[-5:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y, X = dmatrices('Lottery ~ Literacy + Wealth + Region', data=modin_df, return_type='dataframe')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y = pd.DataFrame(y)\n",
    "X = pd.DataFrame(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mod = sm.OLS(y, X)    # Describe model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "res = mod.fit()       # Fit model\n",
    "\n",
    "print(res.summary())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "sm.ols() is not interoperable with Modin currently."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Example with sm.ols(formula=)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "modin_df = pd.DataFrame({\"A\": [10,20,30,40,50], \"B\": [20, 30, 10, 40, 50], \"C\": [32, 234, 23, 23, 42523]})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import statsmodels.formula.api as sm\n",
    "result = sm.ols(formula=\"A ~ B + C\", data=modin_df).fit()\n",
    "print(result.params)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(result.summary())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Replicating statsmodels workflow with pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import statsmodels.api as sm\n",
    "\n",
    "df = sm.datasets.get_rdataset(\"Guerry\", \"HistData\").data\n",
    "pandas_df = pandas.DataFrame(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "vars = ['Department', 'Lottery', 'Literacy', 'Wealth', 'Region']\n",
    "\n",
    "pandas_df = pandas_df[vars]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pandas_df = pandas_df.dropna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y, X = dmatrices('Lottery ~ Literacy + Wealth + Region', data=df, return_type='dataframe')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y = pandas.DataFrame(y)\n",
    "X = pandas.DataFrame(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mod = sm.OLS(y, X)    # Describe model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "res = mod.fit()       # Fit model\n",
    "\n",
    "print(res.summary())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Example with sm.ols(formula=)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pandas_df = pd.DataFrame({\"A\": [10,20,30,40,50], \"B\": [20, 30, 10, 40, 50], \"C\": [32, 234, 23, 23, 42523]})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import statsmodels.formula.api as sm\n",
    "result = sm.ols(formula=\"A ~ B + C\", data=pandas_df).fit()\n",
    "print(result.params)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(result.summary())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "9752fa87da8bf164654ccc33a595e9110c8fc9bb15d763374a7037fd32519b1f"
  },
  "kernelspec": {
   "display_name": "Python 3.9.7 ('base')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
